data_dir = "../../data/Zheng_2021/"
meta = fread(paste0(data_dir, "GSE156728_metadata.txt.gz"))
dim(meta)## [1] 184455 7
meta[1:5,]## cellID cancerType patient libraryID loc meta.cluster
## 1: CHOL.NTH103.0216 CHOL CHOL.P0216 CHOL.NTH103-0216 N CD4.c10.Tm.CAPG
## 2: CHOL.NTH106.0216 CHOL CHOL.P0216 CHOL.NTH106-0216 N CD4.c12.Tem.GZMK
## 3: CHOL.NTH107.0216 CHOL CHOL.P0216 CHOL.NTH107-0216 N CD4.c12.Tem.GZMK
## 4: CHOL.NTH108.0216 CHOL CHOL.P0216 CHOL.NTH108-0216 N CD4.c03.Tn.ADSL
## 5: CHOL.NTH111.0216 CHOL CHOL.P0216 CHOL.NTH111-0216 N CD4.c06.Tm.ANXA1
## platform
## 1: SmartSeq2
## 2: SmartSeq2
## 3: SmartSeq2
## 4: SmartSeq2
## 5: SmartSeq2
table(meta$meta.cluster)##
## CD4.c01.Tn.TCF7 CD4.c02.Tn.PASK CD4.c03.Tn.ADSL
## 10443 785 408
## CD4.c04.Tn.il7r CD4.c05.Tm.TNF CD4.c06.Tm.ANXA1
## 534 1114 12506
## CD4.c07.Tm.ANXA2 CD4.c08.Tm.CREM CD4.c09.Tm.CCL5
## 1763 1677 1580
## CD4.c10.Tm.CAPG CD4.c11.Tm.GZMA CD4.c12.Tem.GZMK
## 2760 1540 2956
## CD4.c13.Temra.CX3CR1 CD4.c14.Th17.SLC4A10 CD4.c15.Th17.IL23R
## 2899 1847 2469
## CD4.c16.Tfh.CXCR5 CD4.c17.TfhTh1.CXCL13 CD4.c18.Treg.RTKN2
## 5925 1809 1809
## CD4.c19.Treg.S1PR1 CD4.c20.Treg.TNFRSF9 CD4.c21.Treg.OAS1
## 1398 15277 694
## CD4.c22.ISG.IFIT1 CD4.c23.Mix.NME1 CD4.c24.Mix.NME2
## 1167 708 998
## CD8.c01.Tn.MAL CD8.c02.Tm.IL7R CD8.c03.Tm.RPS12
## 6892 15501 4485
## CD8.c04.Tm.CD52 CD8.c05.Tem.CXCR5 CD8.c06.Tem.GZMK
## 6546 17009 4624
## CD8.c07.Temra.CX3CR1 CD8.c08.Tk.TYROBP CD8.c09.Tk.KIR2DL4
## 11799 938 2351
## CD8.c10.Trm.ZNF683 CD8.c11.Tex.PDCD1 CD8.c12.Tex.CXCL13
## 18355 2925 10146
## CD8.c13.Tex.myl12a CD8.c14.Tex.TCF7 CD8.c15.ISG.IFIT1
## 454 759 2515
## CD8.c16.MAIT.SLC4A10 CD8.c17.Tm.NME1
## 3497 593
table(meta$loc)##
## N P T
## 69079 8375 107001
table(meta$platform)##
## 10X SmartSeq2
## 183913 542
table(meta$cancerType)##
## BC BCL CHOL ESCA FTC MM OV PACA RC THCA UCEC
## 7354 7719 542 24884 1037 12274 4523 9860 26649 56958 32655
table(meta$platform, meta$loc)##
## N P T
## 10X 68918 8221 106774
## SmartSeq2 161 154 227
table(meta$platform, meta$cancerType)##
## BC BCL CHOL ESCA FTC MM OV PACA RC THCA UCEC
## 10X 7354 7719 0 24884 1037 12274 4523 9860 26649 56958 32655
## SmartSeq2 0 0 542 0 0 0 0 0 0 0 0
Keep those cells from 10X
w2kp = which(meta$platform == "10X")
meta = meta[w2kp,]
dim(meta)## [1] 183913 7
anyDuplicated(meta$cellID)## [1] 0
table(meta$loc)##
## N P T
## 68918 8221 106774
table(meta$platform)##
## 10X
## 183913
table(meta$cancerType)##
## BC BCL ESCA FTC MM OV PACA RC THCA UCEC
## 7354 7719 24884 1037 12274 4523 9860 26649 56958 32655
tcr = fread(paste0(data_dir, "GSE156728_10X_VDJ.merge.txt.gz"))
dim(tcr)## [1] 442618 16
tcr[1:2,]## barcode is_cell high_confidence length chain v_gene d_gene
## 1: AAACGGGAGCCACCTG.1 TRUE TRUE 705 TRB TRBV20-1 TRBD1
## 2: AAACGGGAGCCACCTG.1 TRUE TRUE 494 TRB None None
## j_gene c_gene full_length productive cdr3
## 1: TRBJ1-5 TRBC1 TRUE True CSAKKQGSNQPQHF
## 2: TRBJ1-5 TRBC1 FALSE None None
## cdr3_nt reads umis library.id
## 1: TGCAGTGCCAAAAAACAGGGTTCCAATCAGCCCCAGCATTTT 103056 25 BC-P20190403-N
## 2: None 31683 7 BC-P20190403-N
table(tcr$is_cell)##
## TRUE
## 442618
table(tcr$high_confidence)##
## TRUE
## 442618
table(tcr$chain)##
## IGH IGK IGL Multi None TRA TRB TRD TRG
## 405 26 140 36674 593 193945 210643 65 127
table(tcr$full_length)##
## FALSE TRUE
## 75313 367305
table(tcr$productive)##
## False None True
## 5270 92729 344619
table(tcr$productive, tcr$cdr3=="None")##
## FALSE TRUE
## False 5270 0
## None 289 92440
## True 344619 0
table(tcr$full_length, tcr$productive)##
## False None True
## FALSE 41 75272 0
## TRUE 5229 17457 344619
table(tcr$chain,tcr$productive)##
## False None True
## IGH 0 405 0
## IGK 0 26 0
## IGL 0 140 0
## Multi 133 35342 1199
## None 0 593 0
## TRA 3454 21762 168729
## TRB 1683 34269 174691
## TRD 0 65 0
## TRG 0 127 0
Keep those productive TCRs, check CDR3 length, and the overlap of TCR data with gene expression data. Each record is one TCR chain and thus many cells have two or multiple records.
tcr = tcr[tcr$productive == "True",]
dim(tcr)## [1] 344619 16
ggplot(tcr, aes(x=length))+
geom_histogram(color="darkblue", fill="lightblue")## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tcr[,`:=`(cdr3_length = nchar(tcr$cdr3))]
tb1 = table(tcr$cdr3_length)
tb1 = as.data.frame(tb1)
tb1$Relative_Freq = tb1$Freq/sum(tb1$Freq)
names(tb1)[1] = "CDR3_length"
ggplot(tb1, aes(x=CDR3_length, y=Relative_Freq)) +
geom_bar(stat="identity", color="black", fill="lightblue")tcr[,`:=`(key = paste(library.id, barcode, sep=":"))]
meta[,`:=`(key = paste(libraryID, cellID, sep=":"))]
table(tcr$key %in% meta$key)##
## TRUE
## 344619
table(meta$key %in% tcr$key)##
## FALSE TRUE
## 13032 170881
table(table(tcr$key))##
## 1 2 3 4 5 6 7 8 9 10 11
## 21755 128451 17656 2369 500 88 28 21 8 1 4
Remove those with chain = 'multi', since they have VDJC genes as a mixture from multiple chains. Then generate a tcr_beta to only keep one beta chain per cell, with cdr3 length from 12 to 16, and cdr3 starts with C and ends with F. If one cell has multiple beta chain, choose the one with largest umis.
table(tcr$chain)##
## Multi TRA TRB
## 1199 168729 174691
tcr[tcr$chain == "Multi",][1:2,]## barcode is_cell high_confidence length chain v_gene d_gene
## 1: AAGTCTGTCCGAATGT.1 TRUE TRUE 502 Multi TRDV1 None
## 2: AGCGTCGCACCAGATT.1 TRUE TRUE 1201 Multi TRAV13-1 None
## j_gene c_gene full_length productive cdr3
## 1: TRAJ20 TRAC TRUE True CALGELGYKLSF
## 2: TRAJ45 IGHA2 TRUE True CAASNSGGGADGLTF
## cdr3_nt reads umis library.id
## 1: TGTGCTCTTGGGGAACTAGGGTACAAGCTCAGCTTT 9247 2 BC-P20190403-N
## 2: TGTGCAGCATCAAATTCAGGAGGAGGTGCTGACGGACTCACCTTT 5287 2 BC-P20190403-N
## cdr3_length key
## 1: 12 BC-P20190403-N:AAGTCTGTCCGAATGT.1
## 2: 15 BC-P20190403-N:AGCGTCGCACCAGATT.1
tcr = tcr[tcr$chain != 'Multi',]
dim(tcr)## [1] 343420 18
table(tcr$chain)##
## TRA TRB
## 168729 174691
tcr_beta = tcr[tcr$chain == "TRB"]
dim(tcr_beta)## [1] 174691 18
tcr_beta = tcr_beta[which(tcr_beta$cdr3_length >= 12 &
tcr_beta$cdr3_length <= 16),]
dim(tcr_beta)## [1] 149044 18
tcr_beta = tcr_beta[str_sub(tcr_beta$cdr3, 1, 1)=="C",]
dim(tcr_beta)## [1] 149044 18
tcr_beta = tcr_beta[str_sub(tcr_beta$cdr3, -1)=="F",]
dim(tcr_beta)## [1] 148864 18
t1 = table(tcr_beta$barcode)
table(t1)## t1
## 1 2 3 4 5 6
## 138486 4879 162 28 2 2
tcr_beta[tcr_beta$barcode == names(t1)[t1==6][1],]## barcode is_cell high_confidence length chain v_gene d_gene
## 1: ATGAGGGCATAACCTG.31 TRUE TRUE 696 TRB TRBV20-1 None
## 2: ATGAGGGCATAACCTG.31 TRUE TRUE 856 TRB TRBV5-6 TRBD2
## 3: ATGAGGGCATAACCTG.31 TRUE TRUE 662 TRB TRBV4-1 TRBD1
## 4: ATGAGGGCATAACCTG.31 TRUE TRUE 692 TRB TRBV25-1 TRBD1
## 5: ATGAGGGCATAACCTG.31 TRUE TRUE 913 TRB TRBV5-1 TRBD1
## 6: ATGAGGGCATAACCTG.31 TRUE TRUE 649 TRB TRBV28 TRBD2
## j_gene c_gene full_length productive cdr3
## 1: TRBJ2-1 TRBC2 TRUE True CSARRADYNEQFF
## 2: TRBJ2-2 TRBC2 TRUE True CASSFYGGARTGELFF
## 3: TRBJ1-1 TRBC1 TRUE True CASSQDLRVTEAFF
## 4: TRBJ2-1 TRBC2 TRUE True CASSPYAGQQFF
## 5: TRBJ1-5 TRBC1 TRUE True CASSLEGDRPQHF
## 6: TRBJ2-7 TRBC2 TRUE True CASSSLSGASYEQYF
## cdr3_nt reads umis library.id
## 1: TGCAGTGCTAGAAGGGCGGACTACAATGAGCAGTTCTTC 27161 43 MM-P20190322-T
## 2: TGTGCCAGCAGCTTCTACGGGGGGGCTCGGACCGGGGAGCTGTTTTTT 19703 26 MM-P20190322-T
## 3: TGCGCCAGCAGCCAAGATCTCAGGGTGACTGAAGCTTTCTTT 18431 15 MM-P20190322-T
## 4: TGTGCCAGCAGTCCCTACGCAGGGCAGCAGTTCTTC 14393 13 MM-P20190322-T
## 5: TGCGCCAGCAGCTTGGAAGGGGATCGGCCCCAGCATTTT 16695 22 MM-P20190322-T
## 6: TGTGCCAGCAGTTCCCTTAGCGGGGCCTCCTACGAGCAGTACTTC 9073 10 MM-P20190322-T
## cdr3_length key
## 1: 13 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 2: 16 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 3: 14 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 4: 12 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 5: 13 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 6: 15 MM-P20190322-T:ATGAGGGCATAACCTG.31
tcr_beta = tcr_beta[order(tcr_beta$barcode, -tcr_beta$umis),]
u_barcode = unique(tcr_beta$barcode)
tcr_beta = tcr_beta[match(u_barcode,tcr_beta$barcode),]
dim(tcr_beta)## [1] 143559 18
anyDuplicated(tcr_beta$barcode)## [1] 0
tcr_beta[tcr_beta$barcode == names(t1)[t1==6][1],]## barcode is_cell high_confidence length chain v_gene d_gene
## 1: ATGAGGGCATAACCTG.31 TRUE TRUE 696 TRB TRBV20-1 None
## j_gene c_gene full_length productive cdr3
## 1: TRBJ2-1 TRBC2 TRUE True CSARRADYNEQFF
## cdr3_nt reads umis library.id
## 1: TGCAGTGCTAGAAGGGCGGACTACAATGAGCAGTTCTTC 27161 43 MM-P20190322-T
## cdr3_length key
## 1: 13 MM-P20190322-T:ATGAGGGCATAACCTG.31
fwrite(tcr_beta, file="../../data/Zheng_2021_processed/TCR_beta.txt",
sep="\t")
system("gzip -f ../../data/Zheng_2021_processed/TCR_beta.txt")These are lung cancer specific signatures.
CD8_Hanada = scan(file="../../data/Hanada_2022/CD8_signature.txt",
what=character(0))
CD8_Hanada## [1] "CXCL13+" "ENTPD1+" "BATF+" "GZMB+" "CD27+" "TIGIT+"
## [7] "PHLDA1+" "CD74+" "HLA-DMA+" "HLA-DRA+" "HLA-DRB1+" "HLA-DPB1+"
## [13] "CD3D+" "CD82+" "ARL3+" "HMOX1+" "ALOX5AP+" "DUSP4+"
## [19] "CARS+" "LSP1+" "CCND2+" "TPI1+" "GAPDH+" "ITM2A+"
## [25] "HMGN3+" "CHST12+" "NAP1L4+" "IL7R-" "TPT1-" "RPS12-"
## [31] "RPS16-" "S100A10-"
Hanada = list(CD8=gsub("(\\+|-)$", "", CD8_Hanada),
CD8_sign=str_extract(CD8_Hanada, pattern="(\\+|-)$"))
Hanada## $CD8
## [1] "CXCL13" "ENTPD1" "BATF" "GZMB" "CD27" "TIGIT"
## [7] "PHLDA1" "CD74" "HLA-DMA" "HLA-DRA" "HLA-DRB1" "HLA-DPB1"
## [13] "CD3D" "CD82" "ARL3" "HMOX1" "ALOX5AP" "DUSP4"
## [19] "CARS" "LSP1" "CCND2" "TPI1" "GAPDH" "ITM2A"
## [25] "HMGN3" "CHST12" "NAP1L4" "IL7R" "TPT1" "RPS12"
## [31] "RPS16" "S100A10"
##
## $CD8_sign
## [1] "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+"
## [20] "+" "+" "+" "+" "+" "+" "+" "+" "-" "-" "-" "-" "-"
Here we read the tumor-specific and virus specific DE genes for CD8 T cells.
Oliveira = read_excel("../../data/Oliveira_2021/41586_2021_3704_MOESM3_ESM.xlsx",
sheet="Supplementary Table 6", skip=9)
dim(Oliveira)## [1] 482 5
Oliveira[1:2,]## # A tibble: 2 × 5
## gene avg_logFC p_val p_val_adj Signature
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 KRT86 2.79 0 0 Tumor-specific
## 2 RDH10 2.25 0 0 Tumor-specific
table(Oliveira$Signature)##
## Tumor-specific Virus-specific
## 74 26
Oliveira_tumor = Oliveira$gene[which(Oliveira$Signature== "Tumor-specific")]
Oliveira_virus = Oliveira$gene[which(Oliveira$Signature== "Virus-specific")]
length(Oliveira_tumor)## [1] 74
length(Oliveira_virus)## [1] 26
Lowery et al. (2022) not only generated their own gene signatures, but also compiled a long list of signatures from other studies. Here we first check their own signatures.
clean_list <- function(x){
for(i in 1:length(x)){
x[[i]] = as.character(na.omit(x[[i]]))
}
x
}
sig_Lowery = read_excel("../../data/Lowery_2022/science.abl5447_table_s4.xlsx",
sheet="Signatures from this study")
dim(sig_Lowery)## [1] 243 14
sig_Lowery[1:2,]## # A tibble: 2 × 14
## `Cluster 0` `Cluster 1` `Cluster 2` `Cluster 3` `Cluster 4` `Cluster 5`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 RPS4Y1 CXCL13 LGALS1 CREM ZNF683 EGR1
## 2 TUBA1B TIGIT ANXA1 SNHG12 XCL2 DUSP2
## # … with 8 more variables: Cluster 6 <chr>, Cluster 7 <chr>, Cluster 8 <chr>,
## # Cluster 9 <chr>, Cluster 10 <chr>, Cluster 11 <chr>, NeoTCR4 <chr>,
## # NeoTCR8 <chr>
Lowery = list(CD8 = sig_Lowery$NeoTCR8)
Lowery = clean_list(Lowery)
sapply(Lowery, length)## CD8
## 243
NeoT8 = read_excel("../../data/Lowery_2022/science.abl5447_table_s8.xlsx",
sheet="NeoTCR8")
dim(NeoT8)## [1] 368 7
NeoT8[1:2,]## # A tibble: 2 × 7
## Gene p_val avg_logFC pct.1 pct.2 p_val_adj cluster
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 ATP10D 2.26e-216 1.21 0.373 0.032 3.27e-212 CD8_NeoTCR
## 2 GZMB 3.69e-181 1.19 0.668 0.109 5.35e-177 CD8_NeoTCR
table(NeoT8$avg_logFC < 0, NeoT8$p_val_adj < 0.05)##
## FALSE TRUE
## FALSE 0 244
## TRUE 25 99
table(Lowery$CD8 %in% NeoT8$Gene)##
## TRUE
## 243
wmat = match(Lowery$CD8, NeoT8$Gene)
max(NeoT8$p_val_adj[wmat])## [1] 0.0237495
w8 = NeoT8$avg_logFC < 0 & NeoT8$p_val_adj < 0.05
Lowery_negative = list(CD8 = NeoT8$Gene[w8])
sapply(Lowery_negative, length)## CD8
## 99
Next check the signatures from other studies
sigs = read_excel("../../data/Lowery_2022/science.abl5447_table_s4.xlsx",
sheet="Published signatures")
dim(sigs)## [1] 108 107
sigs[1:2,]## # A tibble: 2 × 107
## Krishna.ACT.Stem.Like Krishna.ACT.Term.Di… Li.CD8.DYS TOX_Scott Wu.8EFF Wu.8EM
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 KLF2 ENTPD1 LAG3 IGKC GNLY GZMK
## 2 RASA3 CD69 HAVCR2 AVIL NKG7 CCL4
## # … with 101 more variables: Wu.8Trm.1 <chr>, Wu.8Trm.2 <chr>, Wu.8Trm.3 <chr>,
## # Wu.8chrom <chr>, Wu.8Mit <chr>, Wu.8KLRB1 <chr>, Oh.CD8.CD39 <chr>,
## # Oh.CD8.Naive <chr>, Oh.CD8.HSP <chr>, Oh.CD8.MAIT <chr>,
## # Oh.CD8.FGFBP2 <chr>, Oh.CD8.RPL <chr>, Oh.CD8.MITO <chr>, Oh.CD8.XCL <chr>,
## # Oh.CD8.CM <chr>, Oh.CD8.PRO <chr>, Mel_Exhaust_Tirosh <chr>,
## # CD8_G_Feldman <chr>, CD8_B_Feldman <chr>, Exhaust_1_Feldman <chr>,
## # Exhaust_2_Feldman <chr>, Exhaust_3_Feldman <chr>, …
aucs = read_excel("../../data/Lowery_2022/science.abl5447_table_s12.xlsx")
dim(aucs)## [1] 108 6
aucs[1:2,]## # A tibble: 2 × 6
## `Gene Signatures` `CD4 training` `CD8 training` `CD4 validation`
## <chr> <dbl> <dbl> <dbl>
## 1 B16_PROG.EX_Miller..40g. 0.404 0.345 0.384
## 2 B16_TERMINAL.EX_Miller..27g. 0.486 0.75 0.484
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
order.train = order(aucs$`CD8 training`, decreasing=TRUE)[1:5]
order.valid = order(aucs$`CD8 validation`, decreasing=TRUE)[1:5]
aucs[order.train,]## # A tibble: 5 × 6
## `Gene Signatures` `CD4 training` `CD8 training` `CD4 validation`
## <chr> <dbl> <dbl> <dbl>
## 1 NeoTCR8.ALL..243g. 0.639 0.971 0.634
## 2 Oliveira.TTE..100g. 0.666 0.95 0.656
## 3 Yost_CD8.Exh..100g. 0.708 0.936 0.714
## 4 Oliveira.Tumor.spec.TTE..96g. 0.577 0.924 0.562
## 5 Oh.CD8.CD39..45g. 0.684 0.922 0.744
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
aucs[order.valid,]## # A tibble: 5 × 6
## `Gene Signatures` `CD4 training` `CD8 training` `CD4 validation`
## <chr> <dbl> <dbl> <dbl>
## 1 NeoTCR8.ALL..243g. 0.639 0.971 0.634
## 2 Oliveira.TTE..100g. 0.666 0.95 0.656
## 3 Yost_CD8.Exh..100g. 0.708 0.936 0.714
## 4 Cluster.6..50g. 0.64 0.908 0.634
## 5 Oh.CD8.CD39..45g. 0.684 0.922 0.744
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
CD8.sigs = aucs$`Gene Signatures`[order.train[1:3]]
CD8.sigs## [1] "NeoTCR8.ALL..243g." "Oliveira.TTE..100g." "Yost_CD8.Exh..100g."
sigs_CD8 = list(Lowery.CD8.neg.99g = Lowery_negative[["CD8"]],
Oliveira.TTE.100g = sigs[["Oliveira.TTE"]],
Yost_CD8.Exh.100g = sigs[["Yost_CD8 Exh"]],
Lowery.CD8.243g = Lowery$CD8)
sigs_CD8 = clean_list(sigs_CD8)
sapply(sigs_CD8, length)## Lowery.CD8.neg.99g Oliveira.TTE.100g Yost_CD8.Exh.100g Lowery.CD8.243g
## 99 100 100 243
cd8T_files = list.files(data_dir, pattern="10X.CD8")
cd8T_files## [1] "GSE156728_BC_10X.CD8.counts.txt.gz"
## [2] "GSE156728_BCL_10X.CD8.counts.txt.gz"
## [3] "GSE156728_ESCA_10X.CD8.counts.txt.gz"
## [4] "GSE156728_MM_10X.CD8.counts.txt.gz"
## [5] "GSE156728_PACA_10X.CD8.counts.txt.gz"
## [6] "GSE156728_RC_10X.CD8.counts.txt.gz"
## [7] "GSE156728_THCA_10X.CD8.counts.txt.gz"
## [8] "GSE156728_UCEC_10X.CD8.counts.txt.gz"
for(f1 in cd8T_files){
d1 = fread(file.path(data_dir, f1))
stopifnot(anyDuplicated(names(d1)) == 0)
print(sprintf("file: %s, dimension: (%d,%d)", f1, nrow(d1), ncol(d1)))
Tcells = which(names(d1) %in% tcr_beta$barcode)
dT = data.matrix(d1[,..Tcells])
rownames(dT) = d1$V1
print(sprintf("%d/%d cells with TCR", length(Tcells), ncol(d1)))
gs = list()
for(s1 in names(sigs_CD8)){
match.gene = na.omit(match(sigs_CD8[[s1]], d1$V1))
print(sprintf("%s: %d/%d genes are found.",
s1, length(match.gene), length(sigs_CD8[[s1]])))
gs[[s1]] = d1$V1[match.gene]
}
sce = SingleCellExperiment(assays=list(counts=dT))
sce = logNormCounts(sce)
sce
gsva.es = gsva(counts(sce), gs, method="ssgsea", verbose=FALSE)
stopifnot(all(colnames(gsva.es) == colnames(sce)))
# some signature genes appear multiple times
sig_genes = unlist(gs[-1])
print("the frequency of sigature genes:")
print(table(table(sig_genes)))
mat1 = match(sig_genes, rownames(sce))
stopifnot(all(! is.na(mat1)))
mat2 = match(Hanada$CD8[which(Hanada$CD8_sign == "+")], rownames(sce))
mat2 = na.omit(mat2)
mat3 = match(Hanada$CD8[which(Hanada$CD8_sign == "-")], rownames(sce))
mat3 = na.omit(mat3)
mat4 = match(Oliveira_tumor, rownames(sce))
mat4 = na.omit(mat4)
mat5 = match(Oliveira_virus, rownames(sce))
mat5 = na.omit(mat5)
print(sprintf("%d/%d genes are found for Hanada postive/negative signatures",
length(mat2), length(mat3)))
print(sprintf("%d/%d genes are found for Oliveira postive/negative signatures",
length(mat4), length(mat5)))
ave_combined = colMeans(logcounts(sce)[mat1,])
ave_Hanada = colMeans(logcounts(sce)[mat2,])
ave_Hanada_neg = colMeans(logcounts(sce)[mat3,])
ave_Oliveira = colMeans(logcounts(sce)[mat4,])
ave_Oliveira_neg = colMeans(logcounts(sce)[mat5,])
sigs = data.frame(ave_Hanada_neg, ave_Oliveira_neg, t(gsva.es)[,1],
ave_Hanada, ave_Oliveira, ave_combined, t(gsva.es)[,-1])
names(sigs)[3] = rownames(gsva.es)[1]
dim(sigs)
sigs[1:2,]
sigs = scale(sigs)
gc = ggcorrplot(cor(sigs), lab = TRUE)
print(gc)
unq_genes = unique(c(unlist(gs), rownames(sce)[mat2], rownames(sce)[mat3],
rownames(sce)[mat4], rownames(sce)[mat5]))
print("number of genes used for PCA:")
print(length(unq_genes))
stopifnot(all(unq_genes %in% d1$V1))
match.gene = match(unq_genes, d1$V1)
sce = runPCA(sce, ncomponents=20, subset_row=match.gene)
set.seed(100100)
sce = runUMAP(sce, dimred="PCA")
clust = clusterCells(sce, use.dimred="PCA",
BLUSPARAM=NNGraphParam(cluster.fun="louvain"))
colData(sce)$louvain = clust
default_palette = scales::hue_pal()(length(unique(clust)))
g1 = plotUMAP(sce, colour_by=I(colData(sce)$louvain),
point_size=0.6) +
guides(color = guide_legend(override.aes = list(size = 2))) +
scale_color_manual(values = default_palette)
print(g1)
colData(sce)[["ssGSEA"]] = colMeans(gsva.es[-1,])
colData(sce)[["positive"]] = rowMeans(sigs[,-(1:3)])
colData(sce)[["negative"]] = rowMeans(sigs[,(1:3)])
colData(sce) = cbind(colData(sce), reducedDim(sce, "UMAP"))
df = as.data.frame(colData(sce))
stopifnot(ncol(df) == 7)
names(df)[6:7] = c("UMAP1", "UMAP2")
g2 = ggplot(df, aes(x=louvain, y=ssGSEA, fill=louvain)) +
geom_boxplot() +
scale_fill_manual(values = default_palette)
g3 = ggplot(df, aes(x=louvain, y=positive, fill=louvain)) +
geom_boxplot() +
scale_color_manual(values = default_palette)
g4 = ggplot(df, aes(x=louvain, y=negative, fill=louvain)) +
geom_boxplot() +
scale_color_manual(values = default_palette)
ga1 = ggarrange(g2, g3, g4, ncol = 3, nrow = 1, legend="top",
common.legend=TRUE)
print(ga1)
ssGSEA_med = tapply(df$ssGSEA, df$louvain, median)
df2 = data.frame(louvain = as.factor(as.numeric(names(ssGSEA_med))),
ssGSEA = ssGSEA_med,
positive = tapply(df$positive, df$louvain, median),
negative = tapply(df$negative, df$louvain, median))
g5 = ggplot(df2, aes(x=ssGSEA, y=positive, color=louvain)) +
geom_point(size=3) +
scale_color_manual(values = default_palette)
g6 = ggplot(df2, aes(x=positive, y=negative, color=louvain)) +
geom_point(size=3) +
scale_color_manual(values = default_palette)
ga2 = ggarrange(g5, g6, ncol = 2, nrow = 1, legend="top",
common.legend=TRUE)
print(ga2)
fnm = sprintf("../../data/Zheng_2021_processed/%s.txt",
gsub(".counts.txt.gz", "", f1))
df$cell = colnames(sce)
fwrite(df, file=fnm, sep="\t")
system(sprintf("gzip -f %s", fnm))
print(sprintf("Done with %s", f1))
print(date())
print("------------------------------------------------------------------")
}## [1] "file: GSE156728_BC_10X.CD8.counts.txt.gz, dimension: (24148,4292)"
## [1] "3332/4292 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 6427 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_BC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:45:06 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_BCL_10X.CD8.counts.txt.gz, dimension: (28855,3483)"
## [1] "2629/3483 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 11637 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_BCL_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:45:56 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_ESCA_10X.CD8.counts.txt.gz, dimension: (24148,12527)"
## [1] "8623/12527 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 4486 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_ESCA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:48:15 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_MM_10X.CD8.counts.txt.gz, dimension: (28855,8630)"
## [1] "6533/8630 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 9709 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_MM_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:50:24 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_PACA_10X.CD8.counts.txt.gz, dimension: (24148,5958)"
## [1] "4769/5958 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 5608 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_PACA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:51:35 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_RC_10X.CD8.counts.txt.gz, dimension: (24148,16545)"
## [1] "12835/16545 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 4390 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_RC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:54:57 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_THCA_10X.CD8.counts.txt.gz, dimension: (24148,33451)"
## [1] "25813/33451 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 2883 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_THCA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 17:02:51 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_UCEC_10X.CD8.counts.txt.gz, dimension: (24148,19927)"
## [1] "15449/19927 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 3837 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
##
## 1 2 3
## 253 59 22
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.
## [1] "Done with GSE156728_UCEC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 17:07:16 2022"
## [1] "------------------------------------------------------------------"
gc()## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 9651986 515.5 17420541 930.4 NA 17420541 930.4
## Vcells 961131785 7332.9 2909303418 22196.3 32768 3636625933 27745.3
sessionInfo()## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggcorrplot_0.1.3 Rtsne_0.15
## [3] svd_0.5 bluster_1.4.0
## [5] cluster_2.1.2 scran_1.22.1
## [7] scater_1.22.0 scuttle_1.4.0
## [9] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0 GenomicRanges_1.46.1
## [13] GenomeInfoDb_1.30.0 IRanges_2.28.0
## [15] S4Vectors_0.32.3 BiocGenerics_0.40.0
## [17] MatrixGenerics_1.6.0 matrixStats_0.61.0
## [19] GSVA_1.42.0 readxl_1.3.1
## [21] stringr_1.4.0 ggpubr_0.4.0
## [23] ggplot2_3.3.5 Matrix_1.3-4
## [25] data.table_1.14.2
##
## loaded via a namespace (and not attached):
## [1] backports_1.4.0 plyr_1.8.6
## [3] igraph_1.2.9 GSEABase_1.56.0
## [5] BiocParallel_1.28.2 digest_0.6.29
## [7] htmltools_0.5.2 viridis_0.6.2
## [9] fansi_0.5.0 magrittr_2.0.1
## [11] memoise_2.0.1 ScaledMatrix_1.2.0
## [13] limma_3.50.0 Biostrings_2.62.0
## [15] annotate_1.72.0 R.utils_2.11.0
## [17] colorspace_2.0-2 blob_1.2.2
## [19] ggrepel_0.9.1 xfun_0.28
## [21] dplyr_1.0.7 crayon_1.4.2
## [23] RCurl_1.98-1.5 jsonlite_1.7.2
## [25] graph_1.72.0 glue_1.5.1
## [27] gtable_0.3.0 zlibbioc_1.40.0
## [29] XVector_0.34.0 DelayedArray_0.20.0
## [31] car_3.0-12 BiocSingular_1.10.0
## [33] Rhdf5lib_1.16.0 HDF5Array_1.22.1
## [35] abind_1.4-5 scales_1.1.1
## [37] DBI_1.1.1 edgeR_3.36.0
## [39] rstatix_0.7.0 Rcpp_1.0.7
## [41] viridisLite_0.4.0 xtable_1.8-4
## [43] dqrng_0.3.0 bit_4.0.4
## [45] rsvd_1.0.5 metapod_1.2.0
## [47] httr_1.4.2 FNN_1.1.3
## [49] ellipsis_0.3.2 pkgconfig_2.0.3
## [51] XML_3.99-0.8 R.methodsS3_1.8.1
## [53] farver_2.1.0 uwot_0.1.10
## [55] sass_0.4.0 locfit_1.5-9.4
## [57] utf8_1.2.2 tidyselect_1.1.1
## [59] labeling_0.4.2 rlang_0.4.12
## [61] reshape2_1.4.4 AnnotationDbi_1.56.2
## [63] munsell_0.5.0 cellranger_1.1.0
## [65] tools_4.1.2 cachem_1.0.6
## [67] cli_3.1.0 generics_0.1.1
## [69] RSQLite_2.2.8 broom_0.7.10
## [71] evaluate_0.14 fastmap_1.1.0
## [73] yaml_2.2.1 knitr_1.36
## [75] bit64_4.0.5 purrr_0.3.4
## [77] KEGGREST_1.34.0 sparseMatrixStats_1.6.0
## [79] R.oo_1.24.0 compiler_4.1.2
## [81] rstudioapi_0.13 beeswarm_0.4.0
## [83] png_0.1-7 ggsignif_0.6.3
## [85] tibble_3.1.6 statmod_1.4.36
## [87] bslib_0.3.1 stringi_1.7.6
## [89] highr_0.9 RSpectra_0.16-0
## [91] lattice_0.20-45 vctrs_0.3.8
## [93] pillar_1.6.4 lifecycle_1.0.1
## [95] rhdf5filters_1.6.0 jquerylib_0.1.4
## [97] RcppAnnoy_0.0.19 BiocNeighbors_1.12.0
## [99] cowplot_1.1.1 bitops_1.0-7
## [101] irlba_2.3.3 R6_2.5.1
## [103] gridExtra_2.3 vipor_0.4.5
## [105] codetools_0.2-18 assertthat_0.2.1
## [107] rhdf5_2.38.0 withr_2.4.3
## [109] GenomeInfoDbData_1.2.7 parallel_4.1.2
## [111] grid_4.1.2 beachmat_2.10.0
## [113] tidyr_1.1.4 rmarkdown_2.11
## [115] DelayedMatrixStats_1.16.0 carData_3.0-4
## [117] ggbeeswarm_0.6.0